Exploratory Data Analysis - Property Prices Hunter NSW

In this project, we will be working with data scraped from an Australian digital property portal and perform some initial analysis to get an idea of property prices in the Hunter NSW region.

Setup and The Dataset

In [1]:
# setup
import pandas as pd
import numpy as np
import folium
from sqlalchemy import create_engine
db_string = connection_string
db = create_engine(db_string)
In [2]:
# reading table from database
properties = pd.read_sql_table("NewNewHouseScrape", con=db)
In [3]:
properties.head()
Out[3]:
index scrape_date street postcode suburb state bed bath parking size price prop_type inspection listing_tag listing_details_features listing_details_description url listing_details_insights
0 0 2020-06-08 15:19:21.935540 7 Noreen Place, 2298 WARATAH WEST NSW 2 Beds 1 Bath 1 Parking 594m² $395,000 to $425,000 House Sat 13 Jun, 10:00am New Property DescriptionDive into Home Ownership i... None None
1 1 2020-06-08 15:19:22.384442 None None
2 2 2020-06-08 15:19:22.385443 17 Grandview Place, 2431 SOUTH WEST ROCKS NSW 842m² $330,000 Vacant land New Property DescriptionDream Location - Views - I... None None
3 3 2020-06-08 15:19:22.751956 Lot 4 Glen Close, 2447 MACKSVILLE NSW 4 Beds 2 Baths 2 Parking 801m² From $360,460 New House & Land New Property DescriptionSKYPE AND ZOOM APPOINTMENT... None None
4 4 2020-06-08 15:19:23.089815 66 Katoomba Avenue, 2262 SAN REMO NSW 4 Beds 2 Baths 1 Parking 575m² New To Market House Sat 13 Jun, 10:00am New Property DescriptionMODERN LUXURY MEETS ENTERT... None None
In [4]:
properties.shape
Out[4]:
(34309, 18)

Data Cleaning

In [5]:
# removing columns we won't need for this analysis
properties.drop(["index", "inspection", "listing_tag", "listing_details_features",
                 "listing_details_description", "url", "listing_details_insights"],
                 axis=1, inplace=True)
In [6]:
# removing empty rows and rows with missing key details (address) from our dataframe
properties = properties[properties["postcode"] != ""]
properties.reset_index(drop=True, inplace=True)

Cleaning Street Column

In [7]:
# copying original street column for debugging
# properties.insert(loc=1, column="street_og", value=properties["street"])
In [8]:
# converting all text in street column to lowercase
properties["street"] = properties["street"].str.lower()
In [9]:
# streamlining street suffix format
suffix_dictionary = {"st,": "street",
                     "cr,": "crescent",
                     "cl,": "close",
                     "cct,": "circuit",
                     "ct,": "circuit",
                     "dr,": "drive",
                     "rd,": "road",
                     "ave,": "avenue",
                     "blvd": "boulevard"}

for key, value in suffix_dictionary.items():
    properties["street"] = properties["street"].str.replace(key, value)
In [10]:
# removing any unwanted characters
properties["street"] = properties["street"].str.replace('"',"'")
In [11]:
pattern = r"[.*(),]"
properties["street"] = properties["street"].str.replace(pattern, "")
In [12]:
# removing any unwanted white space
properties["street"] = properties["street"].str.replace("  ", " ").str.strip()
In [13]:
# manual cleaning of errors encountered in the dataset
properties["street"] = properties["street"].str.replace("pacfic", "pacific")

Creating Street Number Column

In [14]:
properties.insert(loc=2, column="street_number", value=properties["street"])
pattern_one = r"\s([a-z]+)"
properties["street_number"] = properties["street_number"].str.replace(pattern_one, "")
pattern_two = r"[a-z]{2,}"
properties["street_number"] = properties["street_number"].str.replace(pattern_two, "")
properties["street_number"] = properties["street_number"].str.replace("'", "").str.strip()

Creating Street Name Column

In [15]:
properties.insert(loc=3, column="street_name", value=properties["street"])
pattern_three = r"\d+"
properties["street_name"] = properties["street_name"].str.replace(pattern_three, "")
properties["street_name"] = properties["street_name"].str.replace("lots?", "")
pattern_four = r"[&/-]"
properties["street_name"] = properties["street_name"].str.replace(pattern_four, "")
pattern_five = r"\b[b-n,p-z]{1,2}\b"
properties["street_name"] = properties["street_name"].str.replace(pattern_five, "")
pattern_six = r"\b[ab]\b"
properties["street_name"] = properties["street_name"].str.replace(pattern_six, "").str.strip()

Creating Clean Street Column

In [16]:
properties.insert(loc=4, column="clean_street", 
                  value=properties["street_number"] + " " + properties["street_name"])

Removing Original Street Column

In [17]:
properties.drop(["street"], axis=1, inplace=True)
In [18]:
properties.sample(5)
Out[18]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking size price prop_type
8197 2020-06-20 07:45:56.932087 79/3a parkside parade 79/3a parkside parade 2283 TORONTO NSW 2 Beds 2 Baths 2 Parking Contact Agent Apartment / Unit / Flat
19801 2020-08-07 14:26:13.280925 59 davoren drive 59 davoren drive 2444 PORT MACQUARIE NSW 2 Beds 1 Bath 1 Parking $400,000 Retirement Living
26736 2020-09-06 12:22:35.804778 48 watkins road 48 watkins road 2267 WANGI WANGI NSW 3 Beds 3 Baths 4 Parking 925m² $749,000 House
7894 2020-06-20 07:43:27.745332 8 cotswolds close 8 cotswolds close 2260 TERRIGAL NSW 4 Beds 2 Baths 2 Parking 571m² Just Listed House
14170 2020-07-04 08:36:45.697911 39 shiraz street 39 shiraz street 2333 MUSWELLBROOK NSW 5 Beds 3 Baths 2 Parking 787m² Buyer Guide $440,000 - $460,000 House

Cleaning Size Column

Some listings in our dataset (mainly the ones advertising land) don't have entries for the number of bedrooms, bathrooms and parking spots, and have been scraped with the property size listed in the bedroom column while the size column is empty.

In [19]:
# examples:
properties[properties["prop_type"] == "Vacant land"].sample(3)
Out[19]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking size price prop_type
9934 2020-06-24 18:43:23.298964 2a averys rise stage 2a averys rise stage 2321 HEDDON GRETA NSW 630m² PRICE RANGE: $185,000 - $205,000 Vacant land
11763 2020-06-30 14:27:12.491067 16 ayes avenue 16 ayes avenue 2285 CAMERON PARK NSW 600m² $310,000 - $340,000 Vacant land
21576 2020-08-15 11:20:41.975209 530 helena village 530 helena village 2321 LOCHINVAR NSW 900m² PREVIEW Vacant land

Let's correct this by creating a new size column which pulls the size from the bed column where required and from the size column otherwise.

In [20]:
size_update_list = []

for i, row in properties.iterrows():
    if "m" in row["bed"]:
        size_update_list.append(row["bed"])
    elif "ha" in row["bed"]:
        size_update_list.append(row["bed"])
    else:
        size_update_list.append(row["size"])
        
size_updated = pd.DataFrame(size_update_list, columns=["size_updated"])
properties = pd.concat([properties, size_updated], axis=1)
In [21]:
properties.sample(3)
Out[21]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking size price prop_type size_updated
16598 2020-07-20 16:22:40.462054 4/40 little street 4/40 little street 2428 FORSTER NSW 2 Beds 1 Bath 1 Parking $420,000 Apartment / Unit / Flat
12887 2020-07-01 20:20:55.824937 13 lutana street 13 lutana street 2285 EDGEWORTH NSW 4 Beds 2 Baths 2 Parking 613m² $410,000 - $450,000 House 613m²
13254 2020-07-01 20:24:31.463557 4/33a church street 4/33a church street 2330 SINGLETON NSW 4 Beds 2 Baths 2 Parking $435,000 Townhouse
In [22]:
# creating a copy of the original size_updated column for debugging
# properties["size_og"] = properties["size_updated"]
In [23]:
# removing special characters
properties["size_updated"] = properties["size_updated"].str.replace(",", "").str.strip()
In [24]:
# removing the measurement unit for all entries in square metres
properties["size_updated"] = properties["size_updated"].str.replace("m²", "")
In [25]:
# creating a new column for the cleaned size values
pattern = r"ha"
clean_size = []
for value in properties["size_updated"]:
    
    # converting hectar entries to square metres and removing the measurement unit
    if pattern in value:
        value = value.replace("ha", "")
        value = float(value)
        value = value * 10000
        clean_size.append(value)
    
    else:
        clean_size.append(value)

properties["clean_size"] = clean_size
In [26]:
# converting the values in the clean_size column to numerical form
properties["clean_size"] = pd.to_numeric(properties["clean_size"], errors="coerce")
In [27]:
# dropping the original size and updated_size columns
properties.drop(["size", "size_updated"], axis=1, inplace=True)
In [28]:
properties.sample(5)
Out[28]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking price prop_type clean_size
2772 2020-06-11 14:25:20.657347 510/265 sandy point road 510/265 sandy point road 2317 SALAMANDER BAY NSW 3 Beds 3 Baths 1 Parking $489,000 Townhouse NaN
11314 2020-06-26 17:26:02.238059 8 hopetoun street 8 hopetoun street 2260 FORRESTERS BEACH NSW 5 Beds 3 Baths 2 Parking Under Contract House 898.0
12514 2020-07-01 20:16:58.884810 3/541 the entrance road 3/541 the entrance road 2261 BATEAU BAY NSW 2 Beds 1 Bath − Parking Preview House NaN
17437 2020-07-26 09:16:17.013913 525 turnberry avenue 525 turnberry avenue 2325 CESSNOCK NSW 4 Beds 2 Baths 2 Parking $539,414 New House & Land 630.0
14664 2020-07-15 21:00:00.151509 80 belgrave street 80 belgrave street 2440 KEMPSEY NSW 1 Bed 1 Bath − Parking $230,000 House NaN

Cleaning Bed, Bath and Parking Columns

In [29]:
# creating a copy of the original columns for debugging
# properties["og_bed"] = properties["bed"]
# properties["og_bath"] = properties["bath"]
# properties["og_parking"] = properties["parking"]
In [30]:
# removing size values from bed column
pattern_x = r".+ha"
pattern_y = r".+m²"
properties["bed"] = properties["bed"].str.replace(pattern_x, "")
properties["bed"] = properties["bed"].str.replace(pattern_y, "")
In [31]:
# removing any letters
pattern_1 = r"[a-z]+"
columns = ["bed", "bath", "parking"]

for column in columns:
    properties[column] = properties[column].str.lower().str.replace(pattern_1, "")
In [32]:
# additional cleaning for the bed column
pattern_2 = r"[.,−]"
pattern_3 = r"\d{3,}"
properties["bed"] = properties["bed"].str.replace(pattern_2, "")
properties["bed"] = properties["bed"].str.replace(pattern_3, "").str.strip()
In [33]:
# additional cleaning for the bath column
properties["bath"] = properties["bath"].str.replace("−", "").str.strip()
In [34]:
# additional cleaning for the parking column
properties["parking"] = properties["parking"].str.replace("−", "0").str.strip()
In [35]:
# converting the clean values to numerical form
for column in columns:
    properties[column] = pd.to_numeric(properties[column], errors="coerce")
In [36]:
properties.sample(5)
Out[36]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking price prop_type clean_size
13433 2020-07-04 08:30:31.566855 74 rodgers street 74 rodgers street 2294 CARRINGTON NSW 3.0 1.0 0.0 $520,000 - $570,000 House NaN
5379 2020-06-15 18:54:04.506524 158/2 evans road 158/2 evans road 2263 CANTON BEACH NSW 1.0 1.0 1.0 $199,000 - $218,000 House NaN
21597 2020-08-15 11:20:54.652042 537 helena village 537 helena village 2321 LOCHINVAR NSW NaN NaN NaN PREVIEW Vacant land 1514.0
12221 2020-06-30 14:31:07.014006 82 fosterton road 82 fosterton road 2420 DUNGOG NSW 4.0 2.0 2.0 $399,000 House 1985.0
3527 2020-06-12 18:40:59.175137 4 decora crescent 4 decora crescent 2304 WARABROOK NSW 4.0 2.0 4.0 Auction House 682.0

Cleaning Price Column

In [37]:
# creating a copy of the price column for debugging
properties["price_og"] = properties["price"]
In [38]:
# converting all text in the price column to lowercase
properties["price"] = properties["price"].str.lower()
In [39]:
# creating a new column to capture auctions
pattern = r"auction"
auctions = properties["price"].str.contains(pattern)
properties["auction"] = auctions.astype(int)
In [40]:
# cleaning
properties["price"] = properties["price"].str.replace("park", "")
properties["price"] = properties["price"].str.replace("k ", "000")
pattern_a = r"\D+"
properties["price"] = properties["price"].str.replace(",", "").str.replace(" ", "")
properties["price"] = properties["price"].str.replace(pattern_a, " ").str.strip()
pattern_b = r"\b(00)\b"
properties["price"] = properties["price"].str.replace(pattern_two, "").str.strip()
properties["price"] = properties["price"].str.replace(r"2020", "").str.strip()
properties["price"] = properties["price"].str.replace(r"\b[0-9]{1,2}\b", "").str.strip()
properties["price"] = properties["price"].str.replace(r"^\s*$", "").str.strip()
pattern_c = r"(^04\d+)"
properties["price"] = properties["price"].str.replace(pattern_c, "").str.strip()
In [41]:
# creating price min and price max columns
properties["price_min"] = properties["price"].str.split(" ").str.get(0)
properties["price_max"] = properties["price"].str.split(" ").str.get(-1)
In [42]:
# converting price min and price max columns to numeric format
properties["price_min"] = pd.to_numeric(properties["price_min"], errors="coerce")
properties["price_max"] = pd.to_numeric(properties["price_max"], errors="coerce")
In [43]:
# dropping original price column
properties.drop(["price"], axis=1, inplace=True)
In [44]:
properties.sample(5)
Out[44]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking prop_type clean_size price_og auction price_min price_max
9976 2020-06-24 18:43:43.853173 5 deane street 5 deane street 2280 BELMONT NSW 3.0 1.0 1.0 House 582.0 $570,000 - $610,000 0 570000.0 610000.0
2431 2020-06-11 14:22:23.457825 14 bulga street 14 bulga street 2324 PINDIMAR NSW NaN NaN NaN Vacant land 3383.0 $199,000 0 199000.0 199000.0
18324 2020-07-30 09:23:31.216473 812 midnight avenue 812 midnight avenue 2318 MEDOWIE NSW 4.0 2.0 2.0 New House & Land 599.0 $540,990 0 540990.0 540990.0
2672 2020-06-11 14:24:31.899851 17 gorrick street 17 gorrick street 2304 MAYFIELD EAST NSW 2.0 1.0 1.0 House 493.0 $629,950 0 629950.0 629950.0
10557 2020-06-26 17:19:54.950744 109/8 george street 109/8 george street 2250 GOSFORD NSW 1.0 1.0 0.0 Apartment / Unit / Flat NaN $395,000 0 395000.0 395000.0
In [45]:
# creating mean price column
properties["price_mean"] = (properties["price_min"] + properties["price_max"]) / 2
In [46]:
properties.sample(5)
Out[46]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking prop_type clean_size price_og auction price_min price_max price_mean
411 2020-06-08 15:22:49.161307 156/25 mulloway road 156/25 mulloway road 2259 CHAIN VALLEY BAY NSW 3.0 1.0 1.0 Retirement Living 291373.0 $349,000 0 349000.0 349000.0 349000.0
27859 2020-09-08 11:45:21.323403 34 alton close 34 alton close 2324 RAYMOND TERRACE NSW 3.0 1.0 1.0 House NaN $370,000 - $390,000 0 370000.0 390000.0 380000.0
3359 2020-06-12 18:39:38.925494 2/1b corona street 2/1b corona street 2304 MAYFIELD NSW 2.0 1.0 1.0 Apartment / Unit / Flat NaN $480,000 - $525,000 0 480000.0 525000.0 502500.0
6003 2020-06-16 19:02:32.925202 321/51-54 the esplanade 321/51-54 the esplanade 2257 ETTALONG BEACH NSW 2.0 2.0 0.0 Apartment / Unit / Flat NaN $420,000 - $460,000 0 420000.0 460000.0 440000.0
13237 2020-07-01 20:24:21.768176 7 fairmont boulevaroad 7 fairmont boulevaroad 2259 HAMLYN TERRACE NSW 5.0 2.0 2.0 House 450.0 Auction 1 NaN NaN NaN

Removing Duplicate Rows

The scraper does not check whether a property already exists in our database - each time it is run, it simply adds all properties it finds to the existing data. This means that each property in the dataset it likely to have multiple duplicate entries.

In [47]:
# shape of dataframe (including duplicates)
properties.shape
Out[47]:
(28131, 17)
In [48]:
# random example of duplication
properties[(properties["clean_street"] == "9 anvil street") & (properties["suburb"] == "GRETA")]
Out[48]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking prop_type clean_size price_og auction price_min price_max price_mean
14501 2020-07-15 20:58:19.972190 9 anvil street 9 anvil street 2334 GRETA NSW 3.0 1.0 1.0 House 1192.0 $345,000 - $365,000 0 345000.0 365000.0 355000.0
15541 2020-07-16 20:14:48.401991 9 anvil street 9 anvil street 2334 GRETA NSW 3.0 1.0 1.0 House 1192.0 $345,000 - $365,000 0 345000.0 365000.0 355000.0
16780 2020-07-20 16:24:28.515149 9 anvil street 9 anvil street 2334 GRETA NSW 3.0 1.0 1.0 House 1192.0 $345,000 - $365,000 0 345000.0 365000.0 355000.0

For the purpose of this initial analysis, we will first sort the dataframe by scrape_date in descending order, and then remove duplicates based on the clean_street and suburb columns, keeping only the first entry - which will correspond to the latest record due to the way in which we sort the data.

In [49]:
# sorting the data in descending order
properties = properties.sort_values("scrape_date", ascending=False)
In [50]:
# removing all duplicates and keeping only the first (latest) entry
properties = properties.drop_duplicates(subset=["clean_street", "suburb"])
In [51]:
# resetting index
properties.reset_index(drop=True, inplace=True)
In [52]:
# confirming everything worked as expected using the example we looked at above
# - only the most recent entry (20/07/2020) has been kept
properties[(properties["clean_street"] == "9 anvil street") & (properties["suburb"] == "GRETA")]
Out[52]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking prop_type clean_size price_og auction price_min price_max price_mean
4988 2020-07-20 16:24:28.515149 9 anvil street 9 anvil street 2334 GRETA NSW 3.0 1.0 1.0 House 1192.0 $345,000 - $365,000 0 345000.0 365000.0 355000.0
In [53]:
# shape of dataframe after removal of duplicates
properties.shape
Out[53]:
(8458, 17)

Evaluating the Cleaning

We have made great progress with our cleaning, however, given the messy scraped dataset we started off with, we expect there to still be issues - data entry related, scraper related, or cleaning related as the methods we applied to create our clean columns might not have worked for all entries.

The price column with its countless variations in which details were entered made it particularly difficult to account for all scenarios. Let's look at the columns relating to price to see if there is anything further we can do to improve data accuracy.

In [54]:
# min for the price_min column
print("Minimum: " + str(properties["price_min"].min()))
Minimum: 190.0
In [55]:
# entries with price_min values of less than 50000
properties[properties["price_min"] < 50000]
Out[55]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking prop_type clean_size price_og auction price_min price_max price_mean
513 2020-09-08 11:42:56.311359 1/7 banksia street 1/7 banksia street 2428 BLUEYS BEACH NSW 3.0 1.0 1.0 Duplex NaN $1.190m 0 190.0 190.0 190.0
821 2020-09-08 11:39:49.409939 3 rose way 3 rose way 2333 MUSWELLBROOK NSW NaN NaN NaN Vacant land 598.0 $35,000 0 35000.0 35000.0 35000.0
822 2020-09-08 11:39:48.441526 1 rose way 1 rose way 2333 MUSWELLBROOK NSW NaN NaN NaN Vacant land 448.0 $30,000 0 30000.0 30000.0 30000.0
1020 2020-09-06 12:25:27.354437 20 murray street 20 murray street 2469 RAPPVILLE NSW NaN NaN NaN Vacant land 1518.0 $40,000 0 40000.0 40000.0 40000.0
2806 2020-08-15 11:20:49.577529 113 proposed road 113 proposed road 2321 LOCHINVAR NSW 4.0 2.0 2.0 New House & Land 606.0 NOW $579 WAS $626 0 579.0 626.0 602.5
3040 2020-08-07 14:34:24.169075 11 carrera crescent 11 carrera crescent 2265 COORANBONG NSW 5.0 2.0 2.0 House 569.0 Only $15,000 Deposit 0 15000.0 15000.0 15000.0
3223 2020-08-07 14:32:19.054401 00 non urban land 00 non urban land 2324 NORTH ARM COVE NSW NaN NaN NaN Rural NaN FROM $12,000 0 12000.0 12000.0 12000.0
3326 2020-08-07 14:30:57.072664 pyrus avenue pyrus avenue 2335 BRANXTON NSW NaN NaN NaN New land 1984.0 Radford Park 3000m2 New Land Release Now Selling 0 3000.0 3000.0 3000.0
3558 2020-08-07 14:26:21.514390 2/265 sandgate road 2/265 sandgate road 2307 SHORTLAND NSW 3.0 1.0 1.0 Townhouse NaN $430 Per week 0 430.0 0.0 215.0
3672 2020-08-03 19:39:24.867058 2a george flemming road 2a george flemming road 2429 WINGHAM NSW NaN NaN NaN Vacant land NaN $190,00 - $205,000 0 19000.0 205000.0 112000.0
4120 2020-07-30 09:28:17.434542 12 dunmore road 12 dunmore road 2320 LARGS NSW NaN NaN NaN Acreage / Semi-Rural 12200.0 1.495 Million 0 495.0 495.0 495.0
4482 2020-07-26 09:19:35.162242 7 snedden street 7 snedden street 2291 MEREWETHER NSW 3.0 1.0 1.0 House 277.0 Online Auction - Guide from $1.425M 1 425.0 425.0 425.0
5019 2020-07-20 16:24:08.941421 63 forbes street 63 forbes street 2333 MUSWELLBROOK NSW NaN NaN NaN Vacant land 663.0 $45,000 0 45000.0 45000.0 45000.0
5916 2020-07-04 08:34:08.335657 2575 salisbury road via 2575 salisbury road via 2420 DUNGOG NSW NaN NaN NaN Rural 1012900.0 $1.295 Million 0 295.0 295.0 295.0
5988 2020-07-04 08:33:25.444469 19 peace parade 19 peace parade 2324 PINDIMAR NSW NaN NaN NaN Vacant land 835.0 $49,000 0 49000.0 49000.0 49000.0
6034 2020-07-04 08:33:02.605501 8 macartney street 8 macartney street 2329 MERRIWA NSW NaN NaN NaN Vacant land 2023.0 $48,000 0 48000.0 48000.0 48000.0
7312 2020-06-22 19:03:30.758565 281 bundabah road 281 bundabah road 2324 BUNDABAH NSW NaN NaN NaN Vacant land 1153.0 $49,000 0 49000.0 49000.0 49000.0
7315 2020-06-22 19:03:29.483116 1334 commerce way 1334 commerce way 2324 NORTH ARM COVE NSW NaN NaN NaN Vacant land 908.0 $35,000 0 35000.0 35000.0 35000.0
7316 2020-06-22 19:03:29.107413 1335 commerce way 1335 commerce way 2324 NORTH ARM COVE NSW NaN NaN NaN Vacant land 750.0 $35,000 0 35000.0 35000.0 35000.0
7317 2020-06-22 19:03:28.779923 1979 coonamble crescent 1979 coonamble crescent 2324 NORTH ARM COVE NSW NaN NaN NaN Vacant land 1384.0 $16,500 0 16500.0 16500.0 16500.0
7318 2020-06-22 19:03:28.309365 1327 commerce way 1327 commerce way 2324 NORTH ARM COVE NSW NaN NaN NaN Vacant land 614.0 $29,000 0 29000.0 29000.0 29000.0
7321 2020-06-22 19:03:26.000464 1765 the ridgeway 1765 the ridgeway 2324 NORTH ARM COVE NSW NaN NaN NaN Vacant land 587.0 $39,000 0 39000.0 39000.0 39000.0
7322 2020-06-22 19:03:25.650743 14 & 15 the ridgeway 14 & 15 the ridgeway 2324 NORTH ARM COVE NSW NaN NaN NaN Vacant land 668.0 $39,000 0 39000.0 39000.0 39000.0
7323 2020-06-22 19:03:25.234854 42 & 43 the ridgeway 42 & 43 the ridgeway 2324 NORTH ARM COVE NSW NaN NaN NaN Vacant land 669.0 $39,000 0 39000.0 39000.0 39000.0
7414 2020-06-20 07:48:15.650772 9 ashland close 9 ashland close 2320 RUTHERFORD NSW 3.0 1.0 2.0 House 594.0 $349,00-$379,000 0 34900.0 379000.0 206950.0
8221 2020-06-10 13:33:41.734502 6 abberton parkway 6 abberton parkway 2335 NORTH ROTHBURY NSW 4.0 2.0 4.0 House 626.0 $480,00-$510,000 0 48000.0 510000.0 279000.0

Looking at the summary of entries with minimum prices of less than $50,000, we observe that some of the low prices seem legitimate (i.e. small areas of vacant land), while others appear to be errors, either caused by data entry or scraping mistakes (i.e. forgetting a zero) or by cleaning errors (i.e. not accounting for decimal prices entered in millions).

How should we proceed?

The easiest option would be to simply remove any entry with minimum pricing below $65,000, but we would lose some legitimate entries and could potentially affect the pricing analysis for certain suburbs such as North Arm Cove, where vacant land appears to be particularly cheap. Another option would be to try and manually correct as many entries as possible, but the small improvements we could make don't justify the time spent on each individual entry - and being a live project with the scraper being run and data being added to the database every few days, this approach will be less and less practical.

For the purpose of this analysis, we find that removing any entries with a minimum price of less than $50,000 that are not vacant land appears to be the best option, however some incorrect entries would remain:

In [56]:
# example available at the time of assembling this project
properties[properties.index == 3672]
Out[56]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking prop_type clean_size price_og auction price_min price_max price_mean
3672 2020-08-03 19:39:24.867058 2a george flemming road 2a george flemming road 2429 WINGHAM NSW NaN NaN NaN Vacant land NaN $190,00 - $205,000 0 19000.0 205000.0 112000.0

Is there anything we can do to further improve our approach?

Looking at our example above, we notice a great discrepancy between the values for minimum and maximum price.

The price_min and price_max columns reflect the pricing guide for each property which is meant to give potential buyers an idea of the price range a listing falls into, meaning the gap between the two values should not be too large.

Looking at our data, we find a few entries where the price maximum is at least 1.5 times as high as the price minimum, and to no great surprise, all of these listings have questionable price_min or price_max entries.

In [57]:
properties[properties["price_max"] - (properties["price_min"] * 1.5) > 0]
Out[57]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking prop_type clean_size price_og auction price_min price_max price_mean
2503 2020-08-18 11:46:45.077562 3/108 broken bay road 3/108 broken bay road 2257 ETTALONG BEACH NSW 2.0 1.0 1.0 Villa NaN $600k - $660k 2 Doors to Beach ! 0 600000.0 6600002.0 3600001.0
3098 2020-08-07 14:33:44.172449 41 alina lockyer street 41 alina lockyer street 2289 ADAMSTOWN NSW 3.0 2.0 1.0 Townhouse NaN $840,000 to $875,0000 0 840000.0 8750000.0 4795000.0
3672 2020-08-03 19:39:24.867058 2a george flemming road 2a george flemming road 2429 WINGHAM NSW NaN NaN NaN Vacant land NaN $190,00 - $205,000 0 19000.0 205000.0 112000.0
4657 2020-07-26 09:18:00.555687 185 christo road 185 christo road 2298 WARATAH NSW 3.0 1.0 2.0 House 448.0 $575,00 - $625,000 0 57500.0 625000.0 341250.0
6373 2020-07-04 08:29:37.448702 62 havelock street 62 havelock street 2304 MAYFIELD NSW 3.0 1.0 1.0 House 303.0 $630,00 - $670,000 0 63000.0 670000.0 366500.0
7414 2020-06-20 07:48:15.650772 9 ashland close 9 ashland close 2320 RUTHERFORD NSW 3.0 1.0 2.0 House 594.0 $349,00-$379,000 0 34900.0 379000.0 206950.0
8221 2020-06-10 13:33:41.734502 6 abberton parkway 6 abberton parkway 2335 NORTH ROTHBURY NSW 4.0 2.0 4.0 House 626.0 $480,00-$510,000 0 48000.0 510000.0 279000.0

We decide to remove any entries with a minimum price of less than $50,000 that are not vacant land, and then to also remove any entries where the price_max value is at least 1.5 times as high as the price minimum.

In [58]:
# dropping properties with price_min < 50000 that are not vacant land
properties.drop(properties[(properties["price_min"] < 50000) & (properties["prop_type"] != "Vacant land")].index, axis=0, inplace=True)
In [59]:
# dropping rows with unreasonable pricing gap
properties.drop(properties[properties["price_max"] - (properties["price_min"] * 1.5) > 0].index, axis=0, inplace=True)
In [60]:
# resetting index
properties.reset_index(drop=True, inplace=True)
In [61]:
properties.shape
Out[61]:
(8442, 17)

Let's next take a look at the price_max column.

In [62]:
# min for the price_max column
print("Minimum: " + str(properties["price_max"].min()))
Minimum: 0.0

The minimum for the price_max column (0) is less than the minimum we originally found for the price_min column (190), wich reveals that we must have entries in the dataset where price_max < price_min:

In [63]:
properties[properties["price_max"] < properties["price_min"]]
Out[63]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking prop_type clean_size price_og auction price_min price_max price_mean
1760 2020-08-27 12:12:54.649520 6 collins close 6 collins close 2281 CAVES BEACH NSW 3.0 1.0 2.0 House 854.0 $950,000 to $1.045m 0 950000.0 45.0 475022.5
4501 2020-07-26 09:19:20.077973 16 prior circuit 16 prior circuit 2440 WEST KEMPSEY NSW 4.0 2.0 2.0 House 521.0 Buyers Guide $395,000 - $415,00 0 395000.0 41500.0 218250.0
4657 2020-07-26 09:17:54.816136 5031 mcglinchey crescent 5031 mcglinchey crescent 2322 THORNTON NSW 4.0 2.0 2.0 New House & Land 551.0 NOW $6004K WAS $649K 0 6004000.0 649000.0 3326500.0
6059 2020-07-04 08:32:40.852931 canning grove estate canning grove estate 2470 CASINO NSW NaN NaN NaN Vacant land NaN S130,000 - $150.000 0 130000.0 0.0 65000.0

Looking at the above output, we find that most errors are caused by data entry or scraping mistakes, and only few could be fixed by accounting for additional scenarios in our data cleaning process. As a result, we decide to simply drop all columns where price_max < price_min.

In [64]:
# dropping rows where price_max < price_min
properties.drop(properties[properties["price_max"] < properties["price_min"]].index, axis=0, inplace=True)
In [65]:
# resetting index
properties.reset_index(drop=True, inplace=True)
In [66]:
properties.shape
Out[66]:
(8438, 17)
In [67]:
# max for the price_max column
print("Maximum: " + str(properties["price_max"].max()))
Maximum: 10000000.0
In [68]:
properties[properties["price_max"] > 3400000]
Out[68]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking prop_type clean_size price_og auction price_min price_max price_mean
664 2020-09-08 11:41:21.827729 26 coane street 26 coane street 2291 MEREWETHER NSW 5.0 2.0 4.0 House NaN $3,250,000 - $3,500,000 0 3250000.0 3500000.0 3375000.0
1266 2020-09-02 11:46:08.807927 1/41 denton park drive 1/41 denton park drive 2320 RUTHERFORD NSW 3.0 1.0 1.0 House 390.0 $350,0000 0 3500000.0 3500000.0 3500000.0
1637 2020-08-27 12:13:59.161338 213 pollock avenue 213 pollock avenue 2259 WYONG NSW NaN NaN NaN Development Site NaN $6,500,000 0 6500000.0 6500000.0 6500000.0
2663 2020-08-18 11:45:11.793536 209 segenhoe road 209 segenhoe road 2336 ABERDEEN NSW NaN NaN NaN Farm 941000.0 $3,500,000 - $3,700,000 0 3500000.0 3700000.0 3600000.0
2671 2020-08-18 11:45:08.033645 2624 ridgelands road 'shannandore' 2624 ridgelands road 'shannandore' 2337 SCONE NSW NaN NaN NaN Farm 5886000.0 Price Guide $3,500,000 - $3,800,000 0 3500000.0 3800000.0 3650000.0
3423 2020-08-07 14:29:51.623752 77 dilkera avenue 77 dilkera avenue 2280 VALENTINE NSW 4.0 3.0 3.0 House 796.0 $3,900,000 0 3900000.0 3900000.0 3900000.0
5538 2020-07-15 21:03:46.834520 1342 middlebrook valley lodge 1342 middlebrook valley lodge 2337 SCONE NSW NaN NaN NaN Rural NaN $3,850,000 - $4,150,000 Price Guide 0 3850000.0 4150000.0 4000000.0
5555 2020-07-15 21:03:37.679914 2425 STROUD NSW NaN NaN NaN Rural 39320000.0 $10,000,000 0 10000000.0 10000000.0 10000000.0
6094 2020-07-04 08:32:17.745876 301 tocal road 301 tocal road 2320 MINDARIBBA NSW NaN NaN NaN Specialist Farm 391100.0 Negotiating range $3,600,000-$3,900,000 0 3600000.0 3900000.0 3750000.0
7517 2020-06-17 18:12:28.203199 450 & yarraman 'rockhall' 'gilliburn' 450 & yarraman 'rockhall' 'gilliburn' 2333 WYBONG NSW NaN NaN NaN Rural 5961100.0 $3,600,000 0 3600000.0 3600000.0 3600000.0
8312 2020-06-08 15:26:53.979447 1001/61 shortland esplanade 1001/61 shortland esplanade 2300 NEWCASTLE NSW 3.0 2.0 2.0 Apartment / Unit / Flat NaN Guide $3,750,000 0 3750000.0 3750000.0 3750000.0

Only one of the prices above raises questionmarks - the entry for 1/41 Denton Park Drive in Rutherford appears to be incorrect and should be $350,000 (instead of \\$3,500,000) as a quick check on the property website confirms:

cap1

When we dealt with the questionable minimum values and unrealistic price gaps, the solution we implemented was generic, capable of addressing similar issues in future listings that might be added to the database.

If we were to correct the entry for 1/41 Denton Park Drive, our solution would be specific to this single row in our database, and useless for any other entries. What's more, should our scraper pick up the correct pricing next time it runs, the incorrect entry above would be replaced with the latest record (as per the guidelines we specified for the removal of duplicate lines), and any fixes addressing the line would be useless or could "correct" the listing incorrectly.

At this stage, we decide to leave the entry as is.

In [69]:
# dropping the price_og column
properties.drop("price_og", axis=1, inplace=True)

Initial Exploration

In [70]:
# date range
print(properties["scrape_date"].min())
print(properties["scrape_date"].max())
2020-06-08 15:19:22.385443
2020-09-08 11:48:00.986027
In [71]:
# number of suburbs
properties["suburb"].nunique()
Out[71]:
579
In [72]:
# counts of unique values for the bed column
properties["bed"].value_counts().sort_index()
Out[72]:
0.0       15
1.0      124
2.0     1081
3.0     2849
4.0     2600
5.0      429
6.0       88
7.0       11
8.0        9
9.0        1
10.0       5
11.0       4
12.0       2
14.0       4
18.0       1
20.0       1
30.0       1
34.0       1
Name: bed, dtype: int64
In [73]:
# counts of unique values for the property type column
properties["prop_type"].value_counts()
Out[73]:
House                            4981
New House & Land                  842
Vacant land                       755
Apartment / Unit / Flat           585
Townhouse                         309
Rural                             270
Villa                             246
Retirement Living                 137
Acreage / Semi-Rural              129
Duplex                             47
Semi-Detached                      38
Farm                               18
Block of Units                     17
New Apartments / Off the Plan      14
New land                           12
New Home Designs                    9
Development Site                    8
Specialist Farm                     5
Rural Lifestyle Property            5
Terrace                             5
Studio                              4
Livestock Property                  2
Name: prop_type, dtype: int64
In [74]:
# min, max and mean property size (m²)
print("Minimum Property Size: " + str(properties["clean_size"].min()))
print("Maximum Property Size: " + str(properties["clean_size"].max()))
print("Mean Property Size: " + str(properties["clean_size"].mean()))
Minimum Property Size: 1.0
Maximum Property Size: 40230000.0
Mean Property Size: 59067.91190586778
In [75]:
# total number of auctions
properties["auction"].sum()
Out[75]:
581
In [76]:
# percentage of auctions
(properties["auction"].sum() / len(properties)) * 100
Out[76]:
6.885517895235838
In [77]:
# avg bedrooms, size and price per suburb
grouped = properties.groupby("suburb")[["bed", "clean_size", "price_mean"]]
properties_mean_bsp = round(grouped.agg(np.mean))
properties_mean_bsp["properties_per_group"] = grouped.size()
properties_mean_bsp.reset_index(inplace=True)
properties_mean_bsp
Out[77]:
suburb bed clean_size price_mean properties_per_group
0 ABERDARE 3.0 738.0 359694.0 19
1 ABERDEEN 4.0 86876.0 580077.0 14
2 ABERGLASSLYN 4.0 624.0 531633.0 63
3 ABERMAIN 3.0 1509.0 465404.0 16
4 ABERNETHY 2.0 1012.0 287000.0 2
... ... ... ... ... ...
574 YARRANBELLA NaN 424700.0 295000.0 1
575 YARRAS NaN 4300900.0 NaN 2
576 YARRAVEL 3.0 22467.0 497667.0 3
577 YARRAWONGA PARK 3.0 506.0 449000.0 2
578 YESSABAH NaN 2108400.0 2400000.0 1

579 rows × 5 columns

In [78]:
# function for looking at certain suburbs only
def return_suburb_statistics(suburb_list):
    result = properties_mean_bsp[properties_mean_bsp["suburb"].isin(suburb_list)].sort_values("price_mean", ascending=False)
    return result
In [79]:
# closer look at Newcastle and surrounding suburbs
newcastle_suburbs = ["NEWCASTLE", "NEWCASTLE WEST", "NEWCASTLE EAST", "THE HILL", "COOKS HILL", 
                     "BAR BEACH", "THE JUNCTION"]

return_suburb_statistics(newcastle_suburbs)
Out[79]:
suburb bed clean_size price_mean properties_per_group
494 THE JUNCTION 3.0 360.0 1213333.0 7
373 NEWCASTLE EAST 3.0 246.0 1134000.0 2
372 NEWCASTLE 2.0 179.0 1004462.0 73
124 COOKS HILL 2.0 227.0 814688.0 28
27 BAR BEACH 2.0 641.0 725000.0 5
374 NEWCASTLE WEST 2.0 88.0 709643.0 11
493 THE HILL 3.0 246.0 585946.0 19
In [80]:
# closer look at some Lake Macquarie suburbs
lake_mac_suburbs = ["CHARLESTOWN", "GATESHEAD", "KAHIBAH", "WHITEBRIDGE", "DUDLEY", "REDHEAD", 
                    "BENNETTS GREEN","WINDALE", "TINGIRA HEIGHTS", "WARNERS BAY"]

return_suburb_statistics(lake_mac_suburbs)
Out[80]:
suburb bed clean_size price_mean properties_per_group
149 DUDLEY 3.0 568.0 786500.0 10
532 WARNERS BAY 3.0 693.0 717146.0 37
251 KAHIBAH 3.0 451.0 684778.0 13
546 WHITEBRIDGE 3.0 445.0 669567.0 10
498 TINGIRA HEIGHTS 4.0 1201.0 661900.0 9
108 CHARLESTOWN 3.0 645.0 580986.0 87
199 GATESHEAD 3.0 607.0 459500.0 11
426 REDHEAD 2.0 422.0 451735.0 20
551 WINDALE 2.0 654.0 399750.0 2
In [81]:
# statistics per suburb and prop_type
grouped = properties.groupby(["prop_type", "suburb"])[["bed", "clean_size", "price_mean"]]
properties_mean_prop_type = round(grouped.agg(np.mean))
properties_mean_prop_type["properties_per_group"] = grouped.size()
properties_mean_prop_type
Out[81]:
bed clean_size price_mean properties_per_group
prop_type suburb
Acreage / Semi-Rural ALDAVILLA NaN 10100.0 514500.0 2
ALLANDALE NaN 376400.0 NaN 1
ARAKOON NaN 10750.0 987500.0 2
BEECHWOOD NaN 34300.0 789500.0 1
BENSVILLE NaN 4651.0 1175000.0 1
... ... ... ... ... ...
Villa WARNERS BAY 3.0 NaN 685000.0 1
WAUCHOPE 3.0 253.0 365667.0 3
WOY WOY 3.0 201.0 551750.0 13
WYOMING 2.0 NaN 345000.0 1
WYONG 1.0 NaN 130000.0 1

1456 rows × 4 columns

In [82]:
# function for looking at certain suburbs and prop_types only
def return_proptype_suburb_statistics(suburb_list, prop_type_list):
    result = properties_mean_prop_type[
    (properties_mean_prop_type.index.get_level_values("suburb").isin(suburb_list)) & 
    (properties_mean_prop_type.index.get_level_values("prop_type").isin(prop_type_list))]
    return result
In [83]:
prop_types_to_include = ["House", "Apartment / Unit / Flat"]
In [84]:
# closer look at Newcastle and surrounding suburbs
return_proptype_suburb_statistics(newcastle_suburbs, prop_types_to_include)
Out[84]:
bed clean_size price_mean properties_per_group
prop_type suburb
Apartment / Unit / Flat BAR BEACH 2.0 NaN 440000.0 3
COOKS HILL 1.0 124.0 502857.0 11
NEWCASTLE 2.0 154.0 1014129.0 64
NEWCASTLE EAST 3.0 337.0 NaN 1
NEWCASTLE WEST 2.0 88.0 709643.0 11
THE HILL 2.0 98.0 514386.0 13
THE JUNCTION 2.0 NaN NaN 1
House BAR BEACH 3.0 641.0 1295000.0 2
COOKS HILL 3.0 254.0 880000.0 9
NEWCASTLE 7.0 453.0 NaN 1
THE HILL 3.0 205.0 780000.0 2
THE JUNCTION 4.0 414.0 1442500.0 4
In [85]:
# closer look at some Lake Macquarie suburbs
return_proptype_suburb_statistics(lake_mac_suburbs, prop_types_to_include)
Out[85]:
bed clean_size price_mean properties_per_group
prop_type suburb
Apartment / Unit / Flat CHARLESTOWN 2.0 455.0 511616.0 18
REDHEAD 2.0 NaN NaN 1
WARNERS BAY 2.0 156.0 729000.0 11
WHITEBRIDGE 3.0 204.0 NaN 1
House CHARLESTOWN 4.0 688.0 622426.0 57
DUDLEY 3.0 611.0 814375.0 7
GATESHEAD 3.0 607.0 459500.0 11
KAHIBAH 3.0 477.0 706800.0 9
REDHEAD 3.0 464.0 765786.0 8
TINGIRA HEIGHTS 4.0 1201.0 661900.0 9
WARNERS BAY 4.0 755.0 725045.0 19
WHITEBRIDGE 4.0 598.0 666748.0 5
WINDALE 2.0 654.0 399750.0 2

Geo Location Data

We used the clean address details to get the geo location data for our properties and stored them in a separate database. Let's load the data into our notebook and take a look before merging it with our properties dataframe.

In [86]:
# geo data
allGeoData = pd.read_sql_table("geodata2", con=db)
allGeoData = allGeoData.drop_duplicates(["street", "suburb"])
allGeoData.drop(["id", "state", "postcode"], axis=1, inplace=True)
allGeoData["lat"] = allGeoData["lat"].astype(float)
allGeoData["lng"] = allGeoData["lng"].astype(float)
allGeoData.sample(5)
Out[86]:
formattedname street suburb lat lng
1212 32 Brittany Ave, Rutherford NSW 2320, Australia 32 brittany avenue RUTHERFORD -32.698728 151.517185
3312 Koolang Rd, Green Point NSW 2251, Australia 33/67 site koolang road GREEN POINT -33.450167 151.373656
5675 94 Granite St, Port Macquarie NSW 2444, Australia 94 granite street PORT MACQUARIE -31.446938 152.915236
1682 5 Jersey St, South Kempsey NSW 2440, Australia 5 jersey street SOUTH KEMPSEY -31.088721 152.836745
4128 Not Found 108 gardenia drive MEDOWIE 0.000000 0.000000
In [87]:
# merging the dataframes
merged = pd.merge(properties, allGeoData, left_on=["clean_street", "suburb"], 
                  right_on=["street", "suburb"], how="left")
In [88]:
# dropping the street column as it is basically a duplicate of the clean_street column
merged.drop("street", axis=1, inplace=True)
In [89]:
merged.sample(5)
Out[89]:
scrape_date street_number street_name clean_street postcode suburb state bed bath parking prop_type clean_size auction price_min price_max price_mean formattedname lat lng
6427 2020-07-01 20:23:54.907675 11a grey gum road 11a grey gum road 2328 DENMAN NSW NaN NaN NaN Vacant land 4000.0 0 180000.0 180000.0 180000.0 11 Grey Gum Rd, Denman NSW 2328, Australia -32.381236 150.677396
6619 2020-07-01 20:15:41.857824 24 wailele avenue 24 wailele avenue 2262 BUDGEWOI NSW 2.0 2.0 2.0 House 575.0 0 479000.0 479000.0 479000.0 NaN NaN NaN
6417 2020-07-01 20:24:01.709658 39 squadron crescent 39 squadron crescent 2320 RUTHERFORD NSW 3.0 1.0 2.0 House 585.0 0 400000.0 430000.0 415000.0 39 Squadron Cres, Rutherford NSW 2320, Australia -32.721654 151.521793
4670 2020-07-26 09:17:44.622551 18 arila avenue 18 arila avenue 2260 WAMBERAL NSW 3.0 1.0 2.0 House 651.0 0 NaN NaN NaN 18 Arila Ave, Wamberal NSW 2260, Australia -33.434743 151.442613
4552 2020-07-26 09:18:49.459025 27 rosemeadow drive 27 rosemeadow drive 2259 GWANDALAN NSW 4.0 2.0 2.0 House 536.0 1 NaN NaN NaN Not Found 0.000000 0.000000
In [90]:
# confirming the number of rows in merged is equal to the number of rows in properties
merged.shape
Out[90]:
(8438, 19)

Before we can plot our property geo location data on a map, we first need to clean the new columns, then remove any rows that do not have details in the latitude and longitude columns.

In [91]:
# converting "Not Found" values in the formattedname column to NaN
merged["formattedname"] = merged["formattedname"].replace("Not Found", np.nan)
In [92]:
# converting zeros in the lat and lng columns to NaN
merged["lat"] = merged["lat"].replace(0, np.nan)
merged["lng"] = merged["lng"].replace(0, np.nan)
In [93]:
# creating a new dataframe that only contains rows where lat and lng are not NaN
plot_data = merged[merged["lat"].notnull()]

Plotting the Data

We cleaned and combined our datasets and removed any properties for which latitude and longitude details were unavailable. Next, we will use the resulting dataframe to create our map.

In [94]:
m = folium.Map(location=[plot_data["lat"].mean(), plot_data["lng"].mean()], zoom_start=9, tiles='Stamen Toner')
In [95]:
# creating the colour legend 
import branca

colours = ['#14fff7', '#102aeb', '#77e610', '#faf219', '#fcac0a', '#eb4034']

colormap = branca.colormap.StepColormap(colors=colours, vmin=0, vmax=1200000, caption="Price")
colormap.add_to(m);
In [96]:
def colours_and_circles(data, feature_group):
    
    for _, row in data.iterrows():
        
        # using different colours for different price ranges - black if the price is NaN
        if pd.isnull(row["price_mean"]):
            colour = '#000000'
        elif row["price_mean"] > 1000000:
            colour = '#eb4034'
        elif (row["price_mean"] <= 1000000) & (row["price_mean"] > 800000):
            colour = '#fcac0a'
        elif (row["price_mean"] <= 800000) & (row["price_mean"] > 600000):
            colour = '#faf219'
        elif (row["price_mean"] <= 600000) & (row["price_mean"] > 400000):
            colour = '#77e610'
        elif (row["price_mean"] <= 400000) & (row["price_mean"] > 200000):
            colour = '#102aeb'    
        else:
            colour = '#14fff7'
        
        # setting different radius values based on the m² size of the property
        if pd.isnull(row["clean_size"]):
            r = 2
        elif row["clean_size"] >= 5000:
            r = 16
        elif (row["clean_size"] < 5000) & (row["clean_size"] >= 2500):
            r = 14
        elif (row["clean_size"] < 2500) & (row["clean_size"] >= 1800):
            r = 12
        elif (row["clean_size"] < 1800) & (row["clean_size"] >= 1000):
            r = 10
        elif (row["clean_size"] < 1000) & (row["clean_size"] >= 750):
            r = 7   
        elif (row["clean_size"] < 750) & (row["clean_size"] >= 500):
            r = 5  
        else:
            r = 2   
        
        folium.CircleMarker(
            location=[row["lat"], row["lng"]],
            radius=r,
            # adding a popup label to each data point with the address, property type, 
            # number of bedrooms, size in m² and the estimated price
            popup=folium.Popup(str(row["clean_street"]).title() + "<br>" + str(row["suburb"]) + " " + str(row["state"]) + " " + str(row["postcode"]) + "<br>Property Type: " + str(row["prop_type"]) + "<br>Bedrooms: " + str(row["bed"]) + "<br>Size: " + str(row["clean_size"]) + "<br>Estimated Price: " + str(row["price_mean"]), max_width=250),
            color=colour,
            fill=True,
            fill_color=colour
        ).add_to(feature_group)
In [97]:
# creating different layers for our map so we can filter which properties we want to see based
# on number of bedrooms
feature_group_0 = folium.FeatureGroup(name="unknown number of bedrooms")
unknown_bedrooms = plot_data[pd.isnull(plot_data["bed"])]
colours_and_circles(unknown_bedrooms, feature_group_0)
In [98]:
feature_group_1 = folium.FeatureGroup(name="less than two bedrooms")
few_bedrooms = plot_data[plot_data["bed"] < 2]
colours_and_circles(few_bedrooms, feature_group_1)
In [99]:
feature_group_2 = folium.FeatureGroup(name="two bedrooms")
two_bedrooms = plot_data[plot_data["bed"] == 2]
colours_and_circles(two_bedrooms, feature_group_2)
In [100]:
feature_group_3 = folium.FeatureGroup(name="three bedrooms")
three_bedrooms = plot_data[plot_data["bed"] == 3]
colours_and_circles(three_bedrooms, feature_group_3)
In [101]:
feature_group_4 = folium.FeatureGroup(name="four bedrooms")
four_bedrooms = plot_data[plot_data["bed"] == 4]
colours_and_circles(four_bedrooms, feature_group_4)
In [102]:
feature_group_5 = folium.FeatureGroup(name="five bedrooms")
five_bedrooms = plot_data[plot_data["bed"] == 5]
colours_and_circles(five_bedrooms, feature_group_5)
In [103]:
feature_group_6 = folium.FeatureGroup(name="more than five bedrooms")
more_bedrooms = plot_data[plot_data["bed"] > 5]
colours_and_circles(more_bedrooms, feature_group_6)
In [104]:
feature_group_0.add_to(m)
feature_group_1.add_to(m)
feature_group_2.add_to(m)
feature_group_3.add_to(m)
feature_group_4.add_to(m)
feature_group_5.add_to(m)
feature_group_6.add_to(m)
folium.LayerControl().add_to(m)
Out[104]:
<folium.map.LayerControl at 0x11fbc0550>
In [105]:
m
Out[105]:
Make this Notebook Trusted to load map: File -> Trust Notebook